---
title: "African American Achievements"
author: "Anirudh Jayaraman"
date: "11/06/2020"
output: html
---

Loading possibly relevant libraries (that we can keep adding to)

library(tidyverse)
## -- Attaching packages --------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0     v purrr   0.3.4
## v tibble  3.0.1     v dplyr   1.0.0
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Make theme_light() the default ggplot2 theme for every plot below.
theme_set(theme_light())

Getting the data!

# Data source switch: 0 reads the raw CSV files from GitHub,
# any other value goes through the tidytuesdayR package.
choose_how <- 1

if (choose_how != 0) {
  # tidytuesdayR route (https://github.com/thebioengineer/tidytuesdayR).
  # choose_again picks how the week is addressed: 0 = by date string,
  # any other value = by year and week number.
  choose_again <- 1

  tuesdata <- if (choose_again != 0) {
    tidytuesdayR::tt_load(2020, week = 24)
  } else {
    tidytuesdayR::tt_load("2020-06-09")
  }

  firsts <- tuesdata$firsts
  science <- tuesdata$science
} else {
  # Direct download of the two CSV files from the TidyTuesday repository.
  firsts_url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-06-09/firsts.csv"
  science_url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-06-09/science.csv"

  firsts <- readr::read_csv(firsts_url)
  science <- readr::read_csv(science_url)
}
## --- Compiling #TidyTuesday Information for 2020-06-09 ----
## --- There are 2 files available ---
## --- Starting Download ---
## 
##  Downloading file 1 of 2: `firsts.csv`
##  Downloading file 2 of 2: `science.csv`
## --- Download complete ---
# View() opens the interactive data viewer, which is not available when the
# document is knitted or run from a headless session, so only call it when
# someone is actually at the console.
if (interactive()) {
  View(firsts)
  View(science)
}

Firsts

# Distribution of firsts over time, stacked by category.
ggplot(firsts, aes(x = year, fill = category)) +
  geom_histogram(bins = 50)

# Same histogram, one panel per category.
ggplot(firsts, aes(x = year, fill = category)) +
  geom_histogram(bins = 50) +
  facet_wrap(vars(category))

# Number of firsts per category, most frequent first.
count(firsts, category, sort = TRUE)
## # A tibble: 8 x 2
##   category                 n
##   <chr>                <int>
## 1 Arts & Entertainment   107
## 2 Education & Science     87
## 3 Politics                82
## 4 Military                73
## 5 Social & Jobs           57
## 6 Sports                  38
## 7 Religion                21
## 8 Law                     14
# Horizontal bar chart of firsts per category; fct_reorder() orders the
# bars by their count instead of alphabetically.
count(firsts, category, sort = TRUE) %>%
  mutate(category = fct_reorder(category, n)) %>%
  ggplot(aes(x = n, y = category)) +
  geom_col()

# Inspect the raw person column before cleaning it.
select(firsts, person)
## # A tibble: 479 x 1
##    person                                                                       
##    <chr>                                                                        
##  1 Gracia Real de Santa Teresa de Mose (later named Fort Mose) in Florida       
##  2 Jupiter Hammon (poem An Evening Thought                                      
##  3 Wentworth Cheswell, town constable in Newmarket, New Hampshire.[5]           
##  4 Phillis Wheatley (Poems on Various Subjects, Religious and Moral)[6]         
##  5 Silver Bluff Baptist Church, Aiken County, South Carolina[7][8][Note 1]      
##  6 Prince Hall                                                                  
##  7 the 1st Rhode Island Regiment[9]                                             
##  8 James Derham, who did not hold an M.D. degree.[10] (See also 1847 firsts.)   
##  9 Rev. Lemuel Haynes. He was ordained in the Congregational Church, which beca~
## 10 3,000 Black Loyalist slaves,  who had escaped to British lines during the Am~
## # ... with 469 more rows

Removing everything in the person column from the first [ or ( onward (footnote markers and parenthetical notes), then trimming surrounding whitespace

# Preview the cleaned names without modifying firsts: drop everything from
# the first "(" or "[" onward, then strip surrounding whitespace.
firsts %>%
  transmute(person = str_trim(str_remove(person, "[\\(\\[].*")))
## # A tibble: 479 x 1
##    person                                                                       
##    <chr>                                                                        
##  1 Gracia Real de Santa Teresa de Mose                                          
##  2 Jupiter Hammon                                                               
##  3 Wentworth Cheswell, town constable in Newmarket, New Hampshire.              
##  4 Phillis Wheatley                                                             
##  5 Silver Bluff Baptist Church, Aiken County, South Carolina                    
##  6 Prince Hall                                                                  
##  7 the 1st Rhode Island Regiment                                                
##  8 James Derham, who did not hold an M.D. degree.                               
##  9 Rev. Lemuel Haynes. He was ordained in the Congregational Church, which beca~
## 10 3,000 Black Loyalist slaves,  who had escaped to British lines during the Am~
## # ... with 469 more rows

Changing the Firsts dataset to reflect cleaned Person column

# Persist the cleaned person column: remove everything from the first "(" or
# "[" onward, then trim surrounding whitespace.
firsts <- firsts %>%
  mutate(person = person %>%
           str_remove("[\\(\\[].*") %>%
           str_trim())

# tuesdata only exists when the data was loaded through tidytuesdayR
# (choose_how != 0); guard the print so the CSV route does not fail with
# "object 'tuesdata' not found".
if (exists("tuesdata")) {
  tuesdata
}
## Available datasets:
##  firsts 
##  science 
##  

Interactive Graphics

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(glue)
## 
## Attaching package: 'glue'
## The following object is masked from 'package:dplyr':
## 
##     collapse

Attempting a timeline via plotly

# Static timeline: one point per first, placed by year and category.
# The `text` aesthetic is not drawn by ggplot2; it is carried along so that
# ggplotly() can use it as the hover tooltip.
g <- ggplot(
  data = firsts,
  mapping = aes(
    x = year,
    y = category,
    color = category,
    text = glue("{year}, {accomplishment},\n{person}")
  )
) +
  geom_point() +
  labs(
    title = "Timeline of some notable African-American achievements",
    caption = "Source: https://en.wikipedia.org/wiki/List_of_African-American_firsts"
  ) +
  # The colour legend already identifies the category, so hide the y axis
  # labels, ticks and horizontal grid lines.
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank()
  )

# Convert to an interactive plotly widget showing only the custom tooltip text.
ggplotly(g, tooltip = "text")

Science

# Print the science tibble to preview its columns and first rows.
science
## # A tibble: 120 x 7
##    name    birth death occupation_s    inventions_accompli~ references links    
##    <chr>   <dbl> <dbl> <chr>           <chr>                <chr>      <chr>    
##  1 Amos, ~  1918  2003 Microbiologist  First African-Ameri~ 6,         https://~
##  2 Alcorn~  1940    NA Physicist; inv~ Invented a method o~ 7,8,       https://~
##  3 Andrew~  1930  1998 Mathematician   Put forth the Andre~ 9,         https://~
##  4 Alexan~  1888  1958 Civil engineer  Responsible for the~ <NA>       https://~
##  5 Bailey~  1825  1918 Inventor        Folding bed          10,        https://~
##  6 Ball, ~  1892  1916 Chemist         Extracted chaulmoog~ 11,        https://~
##  7 Bannek~  1731  1806 Almanac author~ Constructed wooden ~ 12,        https://~
##  8 Banyag~  1947    NA Mathematician   Work on diffeomorph~ 13,        https://~
##  9 Bashen~  1957    NA Inventor; entr~ First African-Ameri~ 14,        https://~
## 10 Bath, ~  1942  2019 Ophthalmologist First African-Ameri~ 15,16,17,  https://~
## # ... with 110 more rows
# Ten most common values of the raw (unsplit) occupation_s column.
count(science, occupation_s, sort = TRUE) %>%
  head(10)
## # A tibble: 10 x 2
##    occupation_s                    n
##    <chr>                       <int>
##  1 Inventor                       20
##  2 Chemist                         8
##  3 Computer scientist              6
##  4 Mathematician                   5
##  5 Physicist                       4
##  6 Computer engineer               2
##  7 Engineer; inventor              2
##  8 Linguist                        2
##  9 Mathematician; statistician     2
## 10 Psychologist                    2

Some occupations are lists separated by semicolons. The data currently has 120 rows and 7 columns. Let’s expand the dataset so that each row holds a single occupation.

# Count individual occupations after splitting the semicolon-separated lists.
# The separator regex swallows whitespace around each ";" and str_trim()
# removes any remaining outer whitespace, so "Inventor" and " Inventor" are
# no longer counted as two different occupations.
science %>%
  separate_rows(occupation_s, sep = "\\s*;\\s*") %>%
  mutate(occupation = str_to_title(str_trim(occupation_s))) %>%
  count(occupation, sort = TRUE)
## # A tibble: 89 x 2
##    occupation               n
##    <chr>                <int>
##  1 "Inventor"              25
##  2 "Chemist"                9
##  3 " Inventor"              8
##  4 "Mathematician"          8
##  5 "Computer Scientist"     6
##  6 "Physicist"              6
##  7 " Social Scientist"      3
##  8 " Statistician"          3
##  9 "Engineer"               3
## 10 "Psychologist"           3
## # ... with 79 more rows
# One row per (person, occupation). The separator regex also consumes the
# whitespace around each ";" so the split values carry no leading space.
science %>%
  separate_rows(occupation_s, sep = "\\s*;\\s*")
## # A tibble: 164 x 7
##    name    birth death occupation_s  inventions_accomplis~ references links     
##    <chr>   <dbl> <dbl> <chr>         <chr>                 <chr>      <chr>     
##  1 Amos, ~  1918  2003 "Microbiolog~ First African-Americ~ 6,         https://e~
##  2 Alcorn~  1940    NA "Physicist"   Invented a method of~ 7,8,       https://e~
##  3 Alcorn~  1940    NA " inventor"   Invented a method of~ 7,8,       https://e~
##  4 Andrew~  1930  1998 "Mathematici~ Put forth the Andrew~ 9,         https://e~
##  5 Alexan~  1888  1958 "Civil engin~ Responsible for the ~ <NA>       https://e~
##  6 Bailey~  1825  1918 "Inventor"    Folding bed           10,        https://e~
##  7 Ball, ~  1892  1916 "Chemist"     Extracted chaulmoogr~ 11,        https://e~
##  8 Bannek~  1731  1806 "Almanac aut~ Constructed wooden c~ 12,        https://e~
##  9 Bannek~  1731  1806 " surveyor"   Constructed wooden c~ 12,        https://e~
## 10 Bannek~  1731  1806 " farmer"     Constructed wooden c~ 12,        https://e~
## # ... with 154 more rows

Filter the dataset for occupations containing “scientist” (case-insensitive) or “ian” (e.g. mathematician, statistician, physician)

# Keep rows whose single occupation contains "scientist" (any case) or "ian"
# (mathematician, statistician, physician, ...).
# - The separator regex consumes whitespace around ";" so occupation_s has no
#   leading space after the split.
# - Both str_detect() calls use the conventional (string, pattern) order.
# - "ian" matches exactly the same strings as the former ".*ian".
science %>%
  separate_rows(occupation_s, sep = "\\s*;\\s*") %>%
  filter(
    str_detect(occupation_s, regex("scientist", ignore_case = TRUE)) |
      str_detect(occupation_s, "ian")
  )
## # A tibble: 29 x 7
##    name    birth death occupation_s   inventions_accomplis~ references links    
##    <chr>   <dbl> <dbl> <chr>          <chr>                 <chr>      <chr>    
##  1 Andrew~  1930  1998 "Mathematicia~ Put forth the Andrew~ 9,         https://~
##  2 Banyag~  1947    NA "Mathematicia~ Work on diffeomorphi~ 13,        https://~
##  3 Bharuc~  1927  1985 "Mathematicia~ Probability theory a~ 24,        https://~
##  4 Bharuc~  1927  1985 " statisticia~ Probability theory a~ 24,        https://~
##  5 Blackw~  1919  2010 "Mathematicia~ First proposed the B~ 27,        https://~
##  6 Blackw~  1919  2010 " statisticia~ First proposed the B~ 27,        https://~
##  7 Bowman~  1923  2011 "Physician"    Pathologist and gene~ 35,36,     https://~
##  8 Chappe~  1872  1941 "Electrician"  Designed long-distan~ 54,55,56,  https://~
##  9 Chappe~  1925  2019 "Scientist an~ Valuable contributio~ <NA>       https://~
## 10 Dean, ~  1957    NA "Computer sci~ Led the team that de~ 58,59,60,  https://~
## # ... with 19 more rows
# Extract the links column as a character vector.
pull(science, links)
##   [1] "https://en.wikipedia.org/wiki/Harold_Amos"                                     
##   [2] "https://en.wikipedia.org/wiki/George_Edward_Alcorn,_Jr."                       
##   [3] "https://en.wikipedia.org/wiki/James_J._Andrews_(mathematician)"                
##   [4] "https://en.wikipedia.org/wiki/Archie_Alexander"                                
##   [5] "https://en.wikipedia.org/wiki/Leonard_C._Bailey"                               
##   [6] "https://en.wikipedia.org/wiki/Alice_Augusta_Ball"                              
##   [7] "https://en.wikipedia.org/wiki/Benjamin_Banneker"                               
##   [8] "https://en.wikipedia.org/wiki/Augustin_Banyaga"                                
##   [9] "https://en.wikipedia.org/wiki/Janet_Emerson_Bashen"                            
##  [10] "https://en.wikipedia.org/wiki/Patricia_Bath"                                   
##  [11] "https://en.wikipedia.org/wiki/Andrew_Jackson_Beard"                            
##  [12] "https://en.wikipedia.org/wiki/Miriam_Benjamin"                                 
##  [13] "https://en.wikipedia.org/wiki/Leonidas_Berry"                                  
##  [14] "https://en.wikipedia.org/wiki/Albert_Turner_Bharucha-Reid"                     
##  [15] "https://en.wikipedia.org/wiki/Keith_Black_(surgeon)"                           
##  [16] "https://en.wikipedia.org/wiki/David_Blackwell"                                 
##  [17] "https://en.wikipedia.org/wiki/Henry_Blair_(inventor)"                          
##  [18] "https://en.wikipedia.org/wiki/Kwabena_Boahen"                                  
##  [19] "https://en.wikipedia.org/wiki/Sarah_Boone"                                     
##  [20] "https://en.wikipedia.org/wiki/Edward_Bouchet"                                  
##  [21] "https://en.wikipedia.org/wiki/James_E._Bowman"                                 
##  [22] "https://en.wikipedia.org/wiki/Otis_Boykin"                                     
##  [23] "https://en.wikipedia.org/wiki/St._Elmo_Brady"                                  
##  [24] "https://en.wikipedia.org/wiki/Herman_Branson"                                  
##  [25] "https://en.wikipedia.org/w/index.php?title=Oscar_E.Brown&action=edit&redlink=1"
##  [26] "https://en.wikipedia.org/wiki/Marie_Van_Brittan_Brown"                         
##  [27] "https://en.wikipedia.org/wiki/William_Warrick_Cardozo"                         
##  [28] "https://en.wikipedia.org/wiki/Ben_Carson"                                      
##  [29] "https://en.wikipedia.org/wiki/George_Robert_Carruthers"                        
##  [30] "https://en.wikipedia.org/wiki/George_Washington_Carver"                        
##  [31] "https://en.wikipedia.org/wiki/Charles_W._Chappelle"                            
##  [32] "https://en.wikipedia.org/wiki/Emmett_Chappelle"                                
##  [33] "https://en.wikipedia.org/wiki/Kenneth_and_Mamie_Clark"                         
##  [34] "https://en.wikipedia.org/wiki/Kenneth_and_Mamie_Clark"                         
##  [35] "https://en.wikipedia.org/wiki/David_Crosthwait"                                
##  [36] "https://en.wikipedia.org/w/index.php?title=Nick_Curtis&action=edit&redlink=1"  
##  [37] "https://en.wikipedia.org/wiki/John_Dabiri"                                     
##  [38] "https://en.wikipedia.org/wiki/Marie_Maynard_Daly"                              
##  [39] "https://en.wikipedia.org/wiki/Mark_Dean_(computer_scientist)"                  
##  [40] "https://en.wikipedia.org/wiki/Charles_R._Drew"                                 
##  [41] "https://en.wikipedia.org/wiki/Paul_Du_Chaillu"                                 
##  [42] "https://en.wikipedia.org/wiki/Annie_Easley"                                    
##  [43] "https://en.wikipedia.org/wiki/Clarence_Ellis_(computer_scientist)"             
##  [44] "https://en.wikipedia.org/wiki/Bisi_Ezerioha"                                   
##  [45] "https://en.wikipedia.org/wiki/Lloyd_Noel_Ferguson"                             
##  [46] "https://en.wikipedia.org/wiki/Roland_G._Fryer,_Jr."                            
##  [47] "https://en.wikipedia.org/wiki/Sylvester_James_Gates"                           
##  [48] "https://en.wikipedia.org/wiki/Sarah_E._Goode"                                  
##  [49] "https://en.wikipedia.org/wiki/Juan_E._Gilbert"                                 
##  [50] "https://en.wikipedia.org/wiki/George_Franklin_Grant"                           
##  [51] "https://en.wikipedia.org/wiki/Joseph_L._Graves"                                
##  [52] "https://en.wikipedia.org/wiki/Lisa_Green_(linguist)"                           
##  [53] "https://en.wikipedia.org/wiki/Kevin_Greenaugh"                                 
##  [54] "https://en.wikipedia.org/wiki/Bessie_Blount_Griffin"                           
##  [55] "https://en.wikipedia.org/wiki/Lloyd_Hall"                                      
##  [56] "https://en.wikipedia.org/wiki/James_Andrew_Harris"                             
##  [57] "https://en.wikipedia.org/wiki/Walter_Lincoln_Hawkins"                          
##  [58] "https://en.wikipedia.org/wiki/John_E._Hodge"                                   
##  [59] "https://en.wikipedia.org/wiki/Kerrie_Holley"                                   
##  [60] "https://en.wikipedia.org/wiki/Mary_Jackson_(engineer)"                         
##  [61] "https://en.wikipedia.org/wiki/Shirley_Ann_Jackson"                             
##  [62] "https://en.wikipedia.org/wiki/Erich_Jarvis"                                    
##  [63] "https://en.wikipedia.org/wiki/Thomas_L._Jennings"                              
##  [64] "https://en.wikipedia.org/wiki/Lonnie_Johnson_(inventor)"                       
##  [65] "https://en.wikipedia.org/wiki/Katherine_Johnson"                               
##  [66] "https://en.wikipedia.org/wiki/Frederick_McKinley_Jones"                        
##  [67] "https://en.wikipedia.org/wiki/Percy_Lavon_Julian"                              
##  [68] "https://en.wikipedia.org/wiki/Ernest_Everett_Just"                             
##  [69] "https://en.wikipedia.org/wiki/Rick_Kittles"                                    
##  [70] "https://en.wikipedia.org/wiki/Samuel_L._Kountz"                                
##  [71] "https://en.wikipedia.org/wiki/Lewis_Howard_Latimer"                            
##  [72] "https://en.wikipedia.org/wiki/Jerry_Lawson_(engineer)"                         
##  [73] "https://en.wikipedia.org/wiki/Raphael_Carl_Lee"                                
##  [74] "https://en.wikipedia.org/wiki/Beebe_Steven_Lynk"                               
##  [75] "https://en.wikipedia.org/wiki/Mary_Eliza_Mahoney"                              
##  [76] "https://en.wikipedia.org/wiki/Jan_Ernst_Matzeliger"                            
##  [77] "https://en.wikipedia.org/wiki/Henry_Cecil_McBay"                               
##  [78] "https://en.wikipedia.org/wiki/Elijah_McCoy"                                    
##  [79] "https://en.wikipedia.org/wiki/James_McLurkin"                                  
##  [80] "https://en.wikipedia.org/wiki/John_McWhorter"                                  
##  [81] "https://en.wikipedia.org/wiki/Ben_Montgomery"                                  
##  [82] "https://en.wikipedia.org/wiki/Willie_Hobbs_Moore"                              
##  [83] "https://en.wikipedia.org/wiki/Garrett_Morgan"                                  
##  [84] "https://en.wikipedia.org/wiki/Thomas_Mensah_(engineer)"                        
##  [85] "https://en.wikipedia.org/wiki/Alexander_Miles"                                 
##  [86] "https://en.wikipedia.org/wiki/Jerome_Nriagu"                                   
##  [87] "https://en.wikipedia.org/wiki/John_Ogbu"                                       
##  [88] "https://en.wikipedia.org/wiki/Kunle_Olukotun"                                  
##  [89] "https://en.wikipedia.org/wiki/Soni_Oyekan"                                     
##  [90] "https://en.wikipedia.org/wiki/Alice_H._Parker"                                 
##  [91] "https://en.wikipedia.org/wiki/Hildrus_Poindexter"                              
##  [92] "https://en.wikipedia.org/wiki/Arlie_Petters"                                   
##  [93] "https://en.wikipedia.org/wiki/Lloyd_Quarterman"                                
##  [94] "https://en.wikipedia.org/wiki/Earl_W._Renfroe"                                 
##  [95] "https://en.wikipedia.org/wiki/Norbert_Rillieux"                                
##  [96] "https://en.wikipedia.org/wiki/Larry_Robinson_(chemist)"                        
##  [97] "https://en.wikipedia.org/wiki/Archia_Ross"                                     
##  [98] "https://en.wikipedia.org/wiki/Jesse_Russell"                                   
##  [99] "https://en.wikipedia.org/wiki/Thomas_Sowell"                                   
## [100] "https://en.wikipedia.org/wiki/Claude_Steele"                                   
## [101] "https://en.wikipedia.org/wiki/Lee_Stiff"                                       
## [102] "https://en.wikipedia.org/wiki/Window_Snyder"                                   
## [103] "https://en.wikipedia.org/wiki/Lewis_Temple"                                    
## [104] "https://en.wikipedia.org/wiki/Vivien_Thomas"                                   
## [105] "https://en.wikipedia.org/wiki/Charles_Henry_Turner_(zoologist)"                
## [106] "https://en.wikipedia.org/wiki/Neil_deGrasse_Tyson"                             
## [107] "https://en.wikipedia.org/wiki/Dorothy_Vaughan"                                 
## [108] "https://en.wikipedia.org/wiki/Powtawche_Valerino"                              
## [109] "https://en.wikipedia.org/wiki/Arthur_B._C._Walker,_Jr."                        
## [110] "https://en.wikipedia.org/wiki/Madam_C._J._Walker"                              
## [111] "https://en.wikipedia.org/wiki/Warren_M._Washington"                            
## [112] "https://en.wikipedia.org/wiki/James_Edward_Maceo_West"                         
## [113] "https://en.wikipedia.org/wiki/J._Ernest_Wilkins,_Jr."                          
## [114] "https://en.wikipedia.org/wiki/Daniel_Hale_Williams"                            
## [115] "https://en.wikipedia.org/wiki/Scott_W._Williams"                               
## [116] "https://en.wikipedia.org/wiki/Walter_E._Williams"                              
## [117] "https://en.wikipedia.org/wiki/Granville_Woods"                                 
## [118] "https://en.wikipedia.org/wiki/Jane_C._Wright"                                  
## [119] "https://en.wikipedia.org/wiki/Louis_T._Wright"                                 
## [120] "https://en.wikipedia.org/wiki/Roger_Arliner_Young"
library(rvest)
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
## 
##     pluck
## The following object is masked from 'package:readr':
## 
##     guess_encoding
# Fetch one page and show the raw HTML of its first ".vcard" node.
"https://en.wikipedia.org/wiki/David_Blackwell" %>%
  read_html() %>%
  html_node(css = ".vcard") %>%
  as.character()
## [1] "<table class=\"infobox biography vcard\" style=\"width:22em\"><tbody>\n<tr><th colspan=\"2\" style=\"text-align:center;font-size:125%;font-weight:bold\"><div class=\"fn\" style=\"display:inline\">David Blackwell</div></th></tr>\n<tr><td colspan=\"2\" style=\"text-align:center\">\n<a href=\"/wiki/File:David_Blackwell_1999.jpeg\" class=\"image\"><img alt=\"David Blackwell 1999.jpeg\" src=\"//upload.wikimedia.org/wikipedia/commons/thumb/7/73/David_Blackwell_1999.jpeg/220px-David_Blackwell_1999.jpeg\" decoding=\"async\" width=\"220\" height=\"152\" srcset=\"//upload.wikimedia.org/wikipedia/commons/thumb/7/73/David_Blackwell_1999.jpeg/330px-David_Blackwell_1999.jpeg 1.5x, //upload.wikimedia.org/wikipedia/commons/7/73/David_Blackwell_1999.jpeg 2x\" data-file-width=\"400\" data-file-height=\"277\"></a><div>Blackwell in 1999</div>\n</td></tr>\n<tr>\n<th scope=\"row\">Born</th>\n<td>\n<div style=\"display:inline\" class=\"nickname\">David Harold Blackwell</div>\n<br><span style=\"display:none\">(<span class=\"bday\">1919-04-24</span>)</span>April 24, 1919<br><div style=\"display:inline\" class=\"birthplace\">\n<a href=\"/wiki/Centralia,_Illinois\" title=\"Centralia, Illinois\">Centralia, Illinois</a>, U.S.</div>\n</td>\n</tr>\n<tr>\n<th scope=\"row\">Died</th>\n<td>July 8, 2010<span style=\"display:none\">(2010-07-08)</span> (aged 91)<sup id=\"cite_ref-stl-post_1-0\" class=\"reference\"><a href=\"#cite_note-stl-post-1\">[1]</a></sup><br><div style=\"display:inline\" class=\"deathplace\">\n<a href=\"/wiki/Berkeley,_California\" title=\"Berkeley, California\">Berkeley, California</a>, U.S.</div>\n</td>\n</tr>\n<tr>\n<th scope=\"row\">Nationality</th>\n<td class=\"category\">American</td>\n</tr>\n<tr>\n<th scope=\"row\">Alma mater</th>\n<td>\n<a href=\"/wiki/University_of_Illinois_at_Urbana-Champaign\" class=\"mw-redirect\" title=\"University of Illinois at Urbana-Champaign\">University of Illinois at Urbana-Champaign</a> (BA, PhD)</td>\n</tr>\n<tr>\n<th 
scope=\"row\">Known for</th>\n<td>\n<a href=\"/wiki/Rao%E2%80%93Blackwell_theorem\" title=\"Rao–Blackwell theorem\">Rao–Blackwell theorem</a><br><a href=\"/wiki/Blackwell_channel\" title=\"Blackwell channel\">Blackwell channel</a><br><a href=\"/wiki/Arbitrarily_varying_channel\" title=\"Arbitrarily varying channel\">Arbitrarily varying channel</a><br><a href=\"/wiki/Determinacy\" title=\"Determinacy\">Games of imperfect information</a><br><a href=\"/wiki/Dirichlet_distribution\" title=\"Dirichlet distribution\">Dirichlet distribution</a><br><a href=\"/wiki/Bayesian_statistics\" title=\"Bayesian statistics\">Bayesian statistics</a><br><a href=\"/wiki/Mathematical_economics\" title=\"Mathematical economics\">Mathematical economics</a><br><a href=\"/wiki/Recursive_economics\" title=\"Recursive economics\">Recursive economics</a><br><a href=\"/wiki/Sequential_analysis\" title=\"Sequential analysis\">Sequential analysis</a>\n</td>\n</tr>\n<tr>\n<th scope=\"row\">Awards</th>\n<td>\n<a href=\"/wiki/Member_of_the_National_Academy_of_Sciences\" title=\"Member of the National Academy of Sciences\">Member of the National Academy of Sciences</a> (1965)<br><a href=\"/wiki/John_von_Neumann_Theory_Prize\" title=\"John von Neumann Theory Prize\">John von Neumann Theory Prize</a> (1979)<br><a href=\"/wiki/R._A._Fisher_Lectureship\" title=\"R. A. Fisher Lectureship\">R. A. 
Fisher Lectureship</a> (1986)</td>\n</tr>\n<tr><td colspan=\"2\" style=\"text-align:center\"><b>Scientific career</b></td></tr>\n<tr>\n<th scope=\"row\">Fields</th>\n<td class=\"category\">\n<a href=\"/wiki/Probability\" title=\"Probability\">Probability</a><br><a href=\"/wiki/Statistics\" title=\"Statistics\">Statistics</a><br><a href=\"/wiki/Logic\" title=\"Logic\">Logic</a><br><a href=\"/wiki/Game_theory\" title=\"Game theory\">Game theory</a><br><a href=\"/wiki/Dynamic_programming\" title=\"Dynamic programming\">Dynamic programming</a><sup id=\"cite_ref-gs_2-0\" class=\"reference\"><a href=\"#cite_note-gs-2\">[2]</a></sup>\n</td>\n</tr>\n<tr>\n<th scope=\"row\">Institutions</th>\n<td><a href=\"/wiki/University_of_California,_Berkeley\" title=\"University of California, Berkeley\">University of California, Berkeley</a></td>\n</tr>\n<tr>\n<th scope=\"row\"><a href=\"/wiki/Thesis\" title=\"Thesis\">Thesis</a></th>\n<td>\n<a rel=\"nofollow\" class=\"external text\" href=\"https://www.worldcat.org/oclc/493477066\"><i>Some properties of Markoff chains</i></a> <span style=\"font-size:97%;\">(1941)</span>\n</td>\n</tr>\n<tr>\n<th scope=\"row\"><a href=\"/wiki/Doctoral_advisor\" title=\"Doctoral advisor\">Doctoral advisor</a></th>\n<td>\n<a href=\"/wiki/Joseph_Leo_Doob\" class=\"mw-redirect\" title=\"Joseph Leo Doob\">Joseph Leo Doob</a><sup id=\"cite_ref-mathgene_3-0\" class=\"reference\"><a href=\"#cite_note-mathgene-3\">[3]</a></sup>\n</td>\n</tr>\n<tr>\n<th scope=\"row\">Notable students</th>\n<td>\n<a href=\"/wiki/Roger_J-B_Wets\" title=\"Roger J-B Wets\">Roger J-B Wets</a><br>Richard S. Bucy<sup id=\"cite_ref-mathgene_3-1\" class=\"reference\"><a href=\"#cite_note-mathgene-3\">[3]</a></sup>\n</td>\n</tr>\n<tr style=\"display:none\"><td colspan=\"2\">\n</td></tr>\n</tbody></table>\n"
# Parse the same ".vcard" node into a data frame.
"https://en.wikipedia.org/wiki/David_Blackwell" %>%
  read_html() %>%
  html_node(css = ".vcard") %>%
  html_table()
##      David Blackwell
## 1  Blackwell in 1999
## 2               Born
## 3               Died
## 4        Nationality
## 5         Alma mater
## 6          Known for
## 7             Awards
## 8  Scientific career
## 9             Fields
## 10      Institutions
## 11            Thesis
## 12  Doctoral advisor
## 13  Notable students
## 14                  
##                                                                                                                                                                                         David Blackwell
## 1                                                                                                                                                                                     Blackwell in 1999
## 2                                                                                                                             David Harold Blackwell(1919-04-24)April 24, 1919Centralia, Illinois, U.S.
## 3                                                                                                                                       July 8, 2010(2010-07-08) (aged 91)[1]Berkeley, California, U.S.
## 4                                                                                                                                                                                              American
## 5                                                                                                                                                  University of Illinois at Urbana-Champaign (BA, PhD)
## 6  Rao–Blackwell theoremBlackwell channelArbitrarily varying channelGames of imperfect informationDirichlet distributionBayesian statisticsMathematical economicsRecursive economicsSequential analysis
## 7                                                                                  Member of the National Academy of Sciences (1965)John von Neumann Theory Prize (1979)R. A. Fisher Lectureship (1986)
## 8                                                                                                                                                                                     Scientific career
## 9                                                                                                                                           ProbabilityStatisticsLogicGame theoryDynamic programming[2]
## 10                                                                                                                                                                   University of California, Berkeley
## 11                                                                                                                                                             Some properties of Markoff chains (1941)
## 12                                                                                                                                                                                   Joseph Leo Doob[3]
## 13                                                                                                                                                                     Roger J-B WetsRichard S. Bucy[3]
## 14
# Same table as a tibble, with its two columns renamed to key and value.
"https://en.wikipedia.org/wiki/David_Blackwell" %>%
  read_html() %>%
  html_node(css = ".vcard") %>%
  html_table() %>%
  set_names(c("key", "value")) %>%
  as_tibble()
## # A tibble: 14 x 2
##    key              value                                                       
##    <chr>            <chr>                                                       
##  1 "Blackwell in 1~ "Blackwell in 1999"                                         
##  2 "Born"           "David Harold Blackwell(1919-04-24)April 24, 1919Centralia,~
##  3 "Died"           "July 8, 2010(2010-07-08) (aged 91)[1]Berkeley, California,~
##  4 "Nationality"    "American"                                                  
##  5 "Alma mater"     "University of Illinois at Urbana-Champaign (BA, PhD)"      
##  6 "Known for"      "Rao–Blackwell theoremBlackwell channelArbitrarily varying ~
##  7 "Awards"         "Member of the National Academy of Sciences (1965)John von ~
##  8 "Scientific car~ "Scientific career"                                         
##  9 "Fields"         "ProbabilityStatisticsLogicGame theoryDynamic programming[2~
## 10 "Institutions"   "University of California, Berkeley"                        
## 11 "Thesis"         "Some properties of Markoff chains (1941)"                  
## 12 "Doctoral advis~ "Joseph Leo Doob[3]"                                        
## 13 "Notable studen~ "Roger J-B WetsRichard S. Bucy[3]"                          
## 14 ""               ""
# Preview the first rows of the `science` table.
head(science)
## # A tibble: 6 x 7
##   name    birth death occupation_s  inventions_accomplish~ references links     
##   <chr>   <dbl> <dbl> <chr>         <chr>                  <chr>      <chr>     
## 1 Amos, ~  1918  2003 Microbiologi~ First African-America~ 6,         https://e~
## 2 Alcorn~  1940    NA Physicist; i~ Invented a method of ~ 7,8,       https://e~
## 3 Andrew~  1930  1998 Mathematician Put forth the Andrews~ 9,         https://e~
## 4 Alexan~  1888  1958 Civil engine~ Responsible for the c~ <NA>       https://e~
## 5 Bailey~  1825  1918 Inventor      Folding bed            10,        https://e~
## 6 Ball, ~  1892  1916 Chemist       Extracted chaulmoogra~ 11,        https://e~

science_html (built below) has a list-column named html that holds the parsed HTML (XML) document for each link; links that fail to download get NULL.

# Split ";"-separated occupations into one row each, then read the page behind
# every link. Wrapping `read_html` in `possibly()` turns a failed read into
# NULL instead of stopping the map; `quiet = FALSE` still prints the error.
science_html <- science %>%
  separate_rows(occupation_s, sep = ";") %>%
  mutate(
    html = map(
      links,
      possibly(read_html, otherwise = NULL, quiet = FALSE)
    )
  )
## Error: HTTP error 404.
## Error: HTTP error 404.
## Error: HTTP error 404.

Let’s pull the html column (which is actually a list-column) and look at its first few elements

# Show the first elements of the `html` list-column.
head(science_html$html)
## [[1]]
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject  ...
## 
## [[2]]
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject  ...
## 
## [[3]]
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject  ...
## 
## [[4]]
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject  ...
## 
## [[5]]
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject  ...
## 
## [[6]]
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject  ...

Anonymous function (a magrittr functional sequence, created by starting a pipeline with `.`)

# Infobox extractor, built as a magrittr functional sequence (`. %>% ...`).
# Given a parsed page it selects the ".vcard" node, reads that node as a table
# without treating the first row as a header, and returns the table as a tibble.
# `as_tibble()` is used because `as.tibble()` is deprecated as of tibble 2.0.0.
extract_infobox <- . %>%
  html_node(".vcard") %>%
  html_table(header = FALSE) %>%
  as_tibble()
# Printing the object lists the steps of the sequence.
extract_infobox
## Functional sequence with the following components:
## 
##  1. html_node(., ".vcard")
##  2. html_table(., header = FALSE)
##  3. as_tibble(.)
## 
## Use 'functions' to extract the individual functions.

Applying this function to science_html

# Compact column-by-column overview of `science_html`.
glimpse(science_html)
## Rows: 164
## Columns: 8
## $ name                       <chr> "Amos, Harold", "Alcorn, George Edward, ...
## $ birth                      <dbl> 1918, 1940, 1940, 1930, 1888, 1825, 1892...
## $ death                      <dbl> 2003, NA, NA, 1998, 1958, 1918, 1916, 18...
## $ occupation_s               <chr> "Microbiologist", "Physicist", " invento...
## $ inventions_accomplishments <chr> "First African-American department chair...
## $ references                 <chr> "6,", "7,8,", "7,8,", "9,", NA, "10,", "...
## $ links                      <chr> "https://en.wikipedia.org/wiki/Harold_Am...
## $ html                       <list> [<html class="client-nojs" lang="en" di...
# Add an `infobox` list-column by running `extract_infobox` on each parsed
# page. `possibly()` substitutes NULL whenever extraction errors, and
# `quiet = TRUE` suppresses the error message.
science_html %>%
  mutate(
    infobox = map(
      .x = html,
      .f = possibly(extract_infobox, otherwise = NULL, quiet = TRUE)
    )
  )
## Warning: `as.tibble()` is deprecated as of tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## # A tibble: 164 x 9
##    name  birth death occupation_s inventions_acco~ references links html 
##    <chr> <dbl> <dbl> <chr>        <chr>            <chr>      <chr> <lis>
##  1 Amos~  1918  2003 "Microbiolo~ First African-A~ 6,         http~ <xml~
##  2 Alco~  1940    NA "Physicist"  Invented a meth~ 7,8,       http~ <xml~
##  3 Alco~  1940    NA " inventor"  Invented a meth~ 7,8,       http~ <xml~
##  4 Andr~  1930  1998 "Mathematic~ Put forth the A~ 9,         http~ <xml~
##  5 Alex~  1888  1958 "Civil engi~ Responsible for~ <NA>       http~ <xml~
##  6 Bail~  1825  1918 "Inventor"   Folding bed      10,        http~ <xml~
##  7 Ball~  1892  1916 "Chemist"    Extracted chaul~ 11,        http~ <xml~
##  8 Bann~  1731  1806 "Almanac au~ Constructed woo~ 12,        http~ <xml~
##  9 Bann~  1731  1806 " surveyor"  Constructed woo~ 12,        http~ <xml~
## 10 Bann~  1731  1806 " farmer"    Constructed woo~ 12,        http~ <xml~
## # ... with 154 more rows, and 1 more variable: infobox <list>
# Same `infobox` list-column as before (NULL where extraction errors), shown
# with glimpse() so the new column's contents are visible.
science_html %>%
  mutate(
    infobox = map(
      .x = html,
      .f = possibly(extract_infobox, otherwise = NULL, quiet = TRUE)
    )
  ) %>%
  glimpse()
## Rows: 164
## Columns: 9
## $ name                       <chr> "Amos, Harold", "Alcorn, George Edward, ...
## $ birth                      <dbl> 1918, 1940, 1940, 1930, 1888, 1825, 1892...
## $ death                      <dbl> 2003, NA, NA, 1998, 1958, 1918, 1916, 18...
## $ occupation_s               <chr> "Microbiologist", "Physicist", " invento...
## $ inventions_accomplishments <chr> "First African-American department chair...
## $ references                 <chr> "6,", "7,8,", "7,8,", "9,", NA, "10,", "...
## $ links                      <chr> "https://en.wikipedia.org/wiki/Harold_Am...
## $ html                       <list> [<html class="client-nojs" lang="en" di...
## $ infobox                    <list> [NULL, NULL, NULL, NULL, <tbl_df[13 x 2...
# Build the `infobox` list-column, then keep only the rows where an infobox
# was actually extracted (i.e. the entry is not NULL).
science_html %>%
  mutate(
    infobox = map(
      .x = html,
      .f = possibly(extract_infobox, otherwise = NULL, quiet = TRUE)
    )
  ) %>%
  filter(map_lgl(infobox, ~ !is.null(.x)))
## # A tibble: 127 x 9
##    name  birth death occupation_s inventions_acco~ references links html 
##    <chr> <dbl> <dbl> <chr>        <chr>            <chr>      <chr> <lis>
##  1 Alex~  1888  1958 "Civil engi~ Responsible for~ <NA>       http~ <xml~
##  2 Ball~  1892  1916 "Chemist"    Extracted chaul~ 11,        http~ <xml~
##  3 Bann~  1731  1806 "Almanac au~ Constructed woo~ 12,        http~ <xml~
##  4 Bann~  1731  1806 " surveyor"  Constructed woo~ 12,        http~ <xml~
##  5 Bann~  1731  1806 " farmer"    Constructed woo~ 12,        http~ <xml~
##  6 Bany~  1947    NA "Mathematic~ Work on diffeom~ 13,        http~ <xml~
##  7 Bash~  1957    NA "Inventor"   First African-A~ 14,        http~ <xml~
##  8 Bash~  1957    NA " entrepren~ First African-A~ 14,        http~ <xml~
##  9 Bash~  1957    NA " professio~ First African-A~ 14,        http~ <xml~
## 10 Bath~  1942  2019 "Ophthalmol~ First African-A~ 15,16,17,  http~ <xml~
## # ... with 117 more rows, and 1 more variable: infobox <list>